rough outline

2022-04-25

Introduction

Packages Required

Data Preparation

The data we will be looking at:

  1. Deaths due to Air Pollution (https://www.kaggle.com/datasets/akshat0giri/death-due-to-air-pollution-19902017)
  2. World Population (pending)

First let’s import the Deaths by Air Pollution data.

# Load the air-pollution deaths data. read.csv() already returns a
# data frame, so no extra data.frame() wrapper is needed.
deaths_df <- read.csv("death-rates-from-air-pollution.csv")

We are going to rename the columns and view the first 6 rows of the data.

# Replace the original column headers with short snake_case names.
# Names are assigned by position, in the order the columns appear.
names(deaths_df) <- c(
  "country", "acronym", "year",
  "total_deaths", "indoor_deaths", "outdoor_deaths", "ozone_deaths"
)

# Preview the first rows to confirm the new names.
head(deaths_df)
##       country acronym year total_deaths indoor_deaths outdoor_deaths
## 1 Afghanistan     AFG 1990     299.4773      250.3629       46.44659
## 2 Afghanistan     AFG 1991     291.2780      242.5751       46.03384
## 3 Afghanistan     AFG 1992     278.9631      232.0439       44.24377
## 4 Afghanistan     AFG 1993     278.7908      231.6481       44.44015
## 5 Afghanistan     AFG 1994     287.1629      238.8372       45.59433
## 6 Afghanistan     AFG 1995     288.0142      239.9066       45.36714
##   ozone_deaths
## 1     5.616442
## 2     5.603960
## 3     5.611822
## 4     5.655266
## 5     5.718922
## 6     5.739174

Variables that interest us here include:

  * country
  * total_deaths
  * indoor_deaths
  * outdoor_deaths
  * ozone_deaths

Now, we are going to import the World Population data

# Load the population table and inspect its columns and types.
world_pop <- read.csv(file = "population_total_long.csv")
world_pop %>%
  glimpse()
## Rows: 12,595
## Columns: 3
## $ Country.Name <chr> "Aruba", "Afghanistan", "Angola", "Albania", "Andorra", "~
## $ Year         <int> 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 196~
## $ Count        <int> 54211, 8996973, 5454933, 1608800, 13411, 92418, 20481779,~

Let’s split the countries into high- and low-population groups:

  - Low population = high population × 0.10

# Population rows after 1996 for the six high-population countries,
# grouped by year.
# NOTE(review): the planned comparison is the first decade 1996-2006 vs. the
# second decade 2007-2017, but `Year > 1996` excludes 1996 -- confirm whether
# `Year >= 1996` was intended.
high_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(Year)

high_pop_countries
## # A tibble: 126 x 3
## # Groups:   Year [21]
##    Country.Name   Year     Count
##    <chr>         <int>     <int>
##  1 Australia      1997  18517000
##  2 Brazil         1997 167209040
##  3 Germany        1997  82034771
##  4 Nigeria        1997 113457663
##  5 Pakistan       1997 131057431
##  6 United States  1997 272657000
##  7 Australia      1998  18711000
##  8 Brazil         1998 169785250
##  9 Germany        1998  82047195
## 10 Nigeria        1998 116319759
## # ... with 116 more rows
# Population rows after 1996 for the six low-population countries,
# grouped by year.
# NOTE(review): `Year > 1996` keeps 1997 onward -- confirm whether 1996
# should be included.
low_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(Year)

low_pop_countries
## # A tibble: 126 x 3
## # Groups:   Year [21]
##    Country.Name  Year    Count
##    <chr>        <int>    <int>
##  1 Canada        1997 29905948
##  2 Chile         1997 14786220
##  3 Sri Lanka     1997 18470900
##  4 Malawi        1997 10264906
##  5 New Zealand   1997  3781300
##  6 Serbia        1997  7596501
##  7 Canada        1998 30155173
##  8 Chile         1998 14977733
##  9 Sri Lanka     1998 18564599
## 10 Malawi        1998 10552338
## # ... with 116 more rows

After selecting our desired years, we’ve cut the population data set nearly in half.

To get a general idea of the data, let’s make some plots to see what’s happening.

# Scatter plot of indoor_deaths against outdoor_deaths for every row of
# deaths_df. The `text` aesthetic builds a "country, year" label per point,
# presumably for the tooltip of the interactive ggplotly() version.
d <- ggplot(
  deaths_df,
  aes(
    x = indoor_deaths,
    y = outdoor_deaths,
    text = paste0(country, ", ", year)
  )
) +
  geom_point()
ggplotly(d)
# Disabled draft of a population plot, kept for reference:
#p <- ggplot(world_pop, aes(x = Country.Name, y = Year,  text = paste0(Country.Name, ", ", Year) )) + geom_density()
#ggplotly(p)

This is a mess, and so we chose two countries from each continent (a high-population and a low-population country) to graph.

# Mean total deaths from 1990-2017 for each high-population country.
# No select() before summarize(): selecting only total_deaths would drop the
# grouping column, which dplyr then re-adds with an "Adding missing grouping
# variables" message. The summary is the same without that step.
deaths_highpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_high = mean(total_deaths))
#deaths_highpop_countries


# Mean total deaths from 1990-2017 for each low-population country.
# No select() before summarize(): selecting only total_deaths would drop the
# grouping column, which dplyr then re-adds with an "Adding missing grouping
# variables" message. The summary is the same without that step.
deaths_lowpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_low = mean(total_deaths))
#deaths_lowpop_countries
# Render the high- and low-population death summaries in one kable call.
list(deaths_highpop_countries, deaths_lowpop_countries) %>%
  kable()
country average_death_high
Australia 17.76815
Brazil 48.42928
Germany 28.10988
Nigeria 112.30157
Pakistan 144.33463
United States 26.35827
country average_death_low
Canada 18.18542
Chile 36.51321
Malawi 147.77167
New Zealand 15.92536
Serbia 80.66558
Sri Lanka 69.60383
# High-population death summary as a compact table floated to the left.
deaths_highpop_countries %>%
  kable() %>%
  kable_styling(full_width = FALSE, position = "float_left")
country average_death_high
Australia 17.76815
Brazil 48.42928
Germany 28.10988
Nigeria 112.30157
Pakistan 144.33463
United States 26.35827
# Low-population death summary as a compact, left-aligned table.
deaths_lowpop_countries %>%
  kable() %>%
  kable_styling(full_width = FALSE, position = "left")
country average_death_low
Canada 18.18542
Chile 36.51321
Malawi 147.77167
New Zealand 15.92536
Serbia 80.66558
Sri Lanka 69.60383
# Horizontal bar chart of the mean total deaths per high-population country.
ggplot(
  deaths_highpop_countries,
  aes(x = country, y = average_death_high)
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in high-population countries"
  ) +
  coord_flip()

# Same chart for the low-population countries.
ggplot(
  deaths_lowpop_countries,
  aes(x = country, y = average_death_low)
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in low-population countries"
  ) +
  coord_flip()

This shows us the deaths due to pollution, but what about the average population of those countries at that time?

# Mean population of each high-population country.
# NOTE(review): no Year filter is applied, so the mean covers every year in
# world_pop (which includes rows from 1960), while the death averages are
# labelled 1990-2017 -- confirm whether the same year range should be used.
# No select() before summarize(): selecting only Count would drop the
# grouping column, which dplyr then re-adds with an "Adding missing grouping
# variables" message. The summary is the same without that step.
hp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
#hp_countries_population

# Mean population of each low-population country.
# NOTE(review): no Year filter is applied, so the mean covers every year in
# world_pop (which includes rows from 1960), while the death averages are
# labelled 1990-2017 -- confirm whether the same year range should be used.
# No select() before summarize(): selecting only Count would drop the
# grouping column, which dplyr then re-adds with an "Adding missing grouping
# variables" message. The summary is the same without that step.
lp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
#lp_countries_population

# Render the high- and low-population averages in one kable call.
list(hp_countries_population, lp_countries_population) %>%
  kable()
Country.Name average_population
Australia 16825202
Brazil 142954579
Germany 79431999
Nigeria 99902237
Pakistan 110976559
United States 251243424
Country.Name average_population
Canada 27180819
Chile 13125858
Malawi 8997793
New Zealand 3477179
Serbia 7420310
Sri Lanka 16356619
# High-population averages as a compact table floated to the left.
hp_countries_population %>%
  kable() %>%
  kable_styling(full_width = FALSE, position = "float_left")
Country.Name average_population
Australia 16825202
Brazil 142954579
Germany 79431999
Nigeria 99902237
Pakistan 110976559
United States 251243424
# Low-population averages as a compact, left-aligned table.
lp_countries_population %>%
  kable() %>%
  kable_styling(full_width = FALSE, position = "left")
Country.Name average_population
Canada 27180819
Chile 13125858
Malawi 8997793
New Zealand 3477179
Serbia 7420310
Sri Lanka 16356619
# Horizontal bar chart of the mean population per high-population country.
ggplot(
  hp_countries_population,
  aes(x = Country.Name, y = average_population)
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average Population",
    title = "Average high-population countries"
  ) +
  coord_flip()

# Same chart for the low-population countries.
ggplot(
  lp_countries_population,
  aes(x = Country.Name, y = average_population)
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average Population",
    title = "Average low-population countries"
  ) +
  coord_flip()

# TODO: join the data sets so we can overlay the two graphs or draw a stacked bar chart.

Exploratory Data Analysis

Summary